import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.api as sc
import scrublet as scr
import seaborn as sns
import scipy.stats
import anndata
import os
import scipy as scipy
import scipy as sp
import pickle as pkl
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from statsmodels.nonparametric.smoothers_lowess import lowess
from numpy import asarray as ar
from collections import Counter
import networkx as nx
import igraph
import glob
import sys
sys.path.append("/mnt/00_Tools/")
from jp_single import *
from bbknn import bbknn
os.getcwd()
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.logging.print_version_and_date()
%load_ext autoreload
%autoreload 2
result_file = './write/adata_PIP-B_T-all.h5ad'
sys.path.append("/mnt/thymus_atlas")
import scjp
The R object was converted to .h5ad format using the sceasy package: https://github.com/cellgeni/sceasy
adata = sc.read('/mnt/COVID-19/cpm.h5ad')
adata.uns
adata.obs.columns
adata.obs.head(1)
sc.pl.umap(adata, color='manual_celltype')
sc.pl.umap(adata, color='predictedCluster')
adata.obsm
Rik has saved several layers of the data but recommends to use the harmony corrected layer
adata.obsm['X_umap'] = adata.obsm['X_umapafterharmony_adt']
sc.pl.umap(adata, color='predictedCluster')
lr = pkl.load(open('/mnt/PIP/celltypist/manual_models/Immune_v5_allData_highest_all.pkl','rb'))
CT_genes = pd.read_csv('/mnt/PIP/celltypist/manual_models/HumanAtlas_inters_ptprc_plus_genes.csv', header=None)
CT_genes.head(1)
CT_genes['idx'] = CT_genes.index
CT_genes.head(1)
CT_genes.columns = ['symbol', 'idx']
CT_genes = np.array(CT_genes['symbol'])
lr['Model'].features = CT_genes
lr = lr['Model']
#features is the gene names of the dataset to be annotated
features = adata.var_names
lr.features
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x_idx = np.where(k_x)[0]
X = adata.X[:, k_x_idx]
features = features[k_x]
ad_ft = pd.DataFrame(features.values, columns=['ad_features']).reset_index().rename(columns={'index': 'ad_idx'})
lr_ft = pd.DataFrame(lr.features, columns=['lr_features']).reset_index().rename(columns={'index': 'lr_idx'})
lr_idx = lr_ft.merge(ad_ft, left_on='lr_features', right_on='ad_features').sort_values(by='ad_idx').lr_idx.values
lr.n_features_in_ = lr_idx.size
lr.features = lr.features[lr_idx]
lr.coef_ = lr.coef_[:, lr_idx]
predicted_hi = lr.predict(X)
predicted_hi.shape
adata.obs['predicted_hi'] = predicted_hi
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi')
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi', groups=['B cells','ILC','Monocytes', 'T cells', 'DC'])
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_hi')
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'manual_celltype')
lr = pkl.load(open('/mnt/PIP/celltypist/manual_models/Immune_v5_allData_lowest_all.pkl','rb'))
CT_genes = pd.read_csv('/mnt/PIP/celltypist/manual_models/HumanAtlas_inters_ptprc_plus_genes.csv', header=None)
CT_genes.head(1)
CT_genes['idx'] = CT_genes.index
CT_genes.head(1)
CT_genes.columns = ['symbol', 'idx']
CT_genes = np.array(CT_genes['symbol'])
lr['Model'].features = CT_genes
lr = lr['Model']
#features is the gene names of the dataset to be annotated
features = adata.var_names
lr.features
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x = features.isin(list(lr.features))
print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
k_x_idx = np.where(k_x)[0]
X = adata.X[:, k_x_idx]
features = features[k_x]
ad_ft = pd.DataFrame(features.values, columns=['ad_features']).reset_index().rename(columns={'index': 'ad_idx'})
lr_ft = pd.DataFrame(lr.features, columns=['lr_features']).reset_index().rename(columns={'index': 'lr_idx'})
lr_idx = lr_ft.merge(ad_ft, left_on='lr_features', right_on='ad_features').sort_values(by='ad_idx').lr_idx.values
lr.n_features_in_ = lr_idx.size
lr.features = lr.features[lr_idx]
lr.coef_ = lr.coef_[:, lr_idx]
predicted_lo = lr.predict(X)
predicted_lo.shape
adata.obs['predicted_lo'] = predicted_lo
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo')
value_counts = adata.obs['predicted_lo'].value_counts()
type(value_counts)
value_counts = pd.DataFrame(value_counts)
value_counts.head(1)
value_counts.columns
value_counts_2 = value_counts[~(value_counts['predicted_lo'] <= 100)]
value_counts_2
ct_predicted = value_counts_2.index
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo', groups=['Tcm/Naive helper-T cells','Tcm/Naive cytotoxic-T cells','T cells','NK cells','cytotoxic-T cells','memory CD4+ cytotoxic-T cells','follicular B cells','B cells','memory-B cells','ILC','Monocytes', 'DC2'])
ct_predicted = list(ct_predicted)
sc.settings.set_figure_params(dpi=100, color_map='Blues')
sc.pl.umap(adata, color = 'predicted_lo', groups=ct_predicted)
adata.write('/mnt/COVID-19/covid19.h5ad')
matplotlib.rcParams.update({'figure.figsize': (25, 10)})
matplotlib.rc('xtick', labelsize=8)
crosstab = pd.crosstab(adata.obs.predicted_hi, adata.obs.predictedCluster).plot(kind='bar', stacked=True)
freq = pd.crosstab(adata.obs.predicted_hi, adata.obs.predictedCluster)
freq = freq.apply(lambda x: 1 * x / x.sum())
rc={'ytick.labelsize': 8}
sns.set(rc=rc)
sns.clustermap(freq, xticklabels=True, yticklabels=True)
matplotlib.rcParams.update({'figure.figsize': (25, 10)})
matplotlib.rc('xtick', labelsize=8)
crosstab = pd.crosstab(adata.obs.predicted_lo, adata.obs.predictedCluster).plot(kind='bar', stacked=True)
freq = pd.crosstab(adata.obs.predicted_lo, adata.obs.predictedCluster)
freq = freq.apply(lambda x: 1 * x / x.sum())
rc={'ytick.labelsize': 8}
sns.set(rc=rc)
sns.clustermap(freq, xticklabels=True, yticklabels=True)